ESL Dataset
# Load the ESL mixture example. The .RData file is expected to define
# `mixture.example`, whose `x` component supplies the two predictor columns.
load("../data/mixture.example.RData")

# Parameters of the linear signal from which the outcome is simulated.
intercept <- 50.0
coef_x1 <- 10.0
coef_x2 <- -2.0

# Fix the RNG state so the Gaussian noise (sd = 5) added to y is reproducible.
set.seed(1000)
df <- data.frame(
  x1 = mixture.example$x[, 1],
  x2 = mixture.example$x[, 2]
) %>%
  mutate(
    y = intercept + coef_x1 * x1 + coef_x2 * x2 +
      rnorm(length(x1), mean = 0, sd = 5)
  )

# Coarse (0.1-spaced) axis sequences spanning the observed predictor ranges.
x.grid <- seq(from = min(df$x1), to = max(df$x1), by = 0.1)
y.grid <- seq(from = min(df$x2), to = max(df$x2), by = 0.1)

# Quick look at the distribution of the simulated outcome.
hist(df$y)

Figure (just the dataset)
# Background lattice built from the existing x.grid / y.grid sequences; it is
# drawn as faint gray dots on top of the observations.
xy.grid <- setNames(expand.grid(x.grid, y.grid), c("x1", "x2"))

# Observations coloured by outcome (shape 21 uses both colour and fill), with
# the lattice overlaid. Axis tick labels and panel grid lines are hidden.
p <- ggplot(df, aes(x = x1, y = x2)) +
  geom_point(aes(colour = y, fill = y), shape = 21) +
  geom_point(data = xy.grid, colour = "gray50", size = 0.02, alpha = 0.5) +
  scale_colour_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  scale_fill_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  labs(x = "Disease Severity Score (x1)", y = "Social Determinants Score (x2)") +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    legend.position = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
print(p)
ggsave(
  filename = "../img/esl-reg-just-data.png", plot = p,
  width = 4, height = 4.5, units = "in", dpi = 300
)

Linear regression
# Prediction lattice at 0.05 spacing over the observed predictor ranges.
x.grid <- seq(from = min(df$x1), to = max(df$x1), by = 0.05)
y.grid <- seq(from = min(df$x2), to = max(df$x2), by = 0.05)
xy.grid <- setNames(expand.grid(x.grid, y.grid), c("x1", "x2"))

# Least-squares fit of y on both predictors, evaluated at every lattice point.
m <- lm(y ~ x1 + x2, data = df)
xy.grid$yhat <- predict(m, newdata = xy.grid)

# Observations, then the predicted surface as small translucent points, then
# contour lines at the quartiles of the predictions. The view is clipped to
# the lattice extent and the legend is hidden.
p <- ggplot(df, aes(x = x1, y = x2)) +
  geom_point(aes(colour = y, fill = y), shape = 21) +
  geom_point(aes(colour = yhat), data = xy.grid, size = 0.5, alpha = 0.4) +
  stat_contour(
    aes(z = yhat),
    data = xy.grid, colour = "gray30",
    breaks = quantile(xy.grid$yhat, probs = seq(0, 1, by = 0.25))
  ) +
  scale_colour_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  scale_fill_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  coord_cartesian(xlim = range(xy.grid$x1), ylim = range(xy.grid$x2)) +
  labs(x = "Disease Severity Score (x1)", y = "Social Determinants Score (x2)") +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
print(p)
ggsave(
  filename = "../img/esl-reg-linear.png", plot = p,
  width = 4, height = 4, units = "in", dpi = 300
)

KNN with K=15
# Prediction lattice at 0.05 spacing over the observed predictor ranges.
x.grid <- seq(min(df$x1), max(df$x1), 0.05)
y.grid <- seq(min(df$x2), max(df$x2), 0.05)
xy.grid <- expand.grid(x.grid, y.grid)
names(xy.grid) <- c("x1", "x2")

# K-nearest-neighbour regression: the prediction for each test row is the mean
# outcome of its k closest training rows (Euclidean distance).
#
# class::knn() is a classifier. Given a continuous outcome it treats every
# distinct y as its own class, so the k-way vote ties and one neighbour's y is
# returned at random, which is not a regression estimate.
#
# train_x: training predictors (data frame or matrix, one row per observation)
# train_y: numeric outcomes, one per row of train_x
# test_x:  points to predict at, same columns as train_x
# k:       number of neighbours to average
# Returns a numeric vector with one prediction per row of test_x. Distance
# ties at the k-th neighbour are resolved by training-row order.
knn_regress <- function(train_x, train_y, test_x, k) {
  # Transposed so that subtracting one test point recycles down each column.
  train_t <- t(as.matrix(train_x))
  test_x <- as.matrix(test_x)
  stopifnot(
    nrow(train_t) == ncol(test_x),
    ncol(train_t) == length(train_y),
    k >= 1, k <= length(train_y)
  )
  vapply(seq_len(nrow(test_x)), function(i) {
    sq_dist <- colSums((train_t - test_x[i, ])^2)
    mean(train_y[order(sq_dist)[seq_len(k)]])
  }, numeric(1))
}

m <- knn_regress(df[, c("x1", "x2")], df$y, xy.grid[, c("x1", "x2")], k = 15)
xy.grid$yhat <- m

# Observations, the predicted surface as small translucent points, and contour
# lines at the quartiles of the predictions; legend hidden.
p <- ggplot(df) +
  geom_point(aes(x = x1, y = x2, colour = y, fill = y), pch = 21) +
  geom_point(aes(x = x1, y = x2, colour = yhat), alpha = 0.4, data = xy.grid, size = 0.5) +
  theme_minimal() +
  xlab("Disease Severity Score (x1)") +
  ylab("Social Determinants Score (x2)") +
  theme(axis.text = element_blank()) +
  scale_fill_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  stat_contour(
    aes(x = x1, y = x2, z = yhat),
    breaks = quantile(xy.grid$yhat, seq(0, 1, 0.25)),
    data = xy.grid, colour = "gray30"
  ) +
  scale_colour_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(xlim = range(xy.grid$x1), ylim = range(xy.grid$x2))
print(p)
ggsave(p, filename = "../img/esl-reg-knn-15.png", height = 4, width = 4, units = "in", dpi = 300)

Decision tree
# Prediction lattice at 0.05 spacing over the observed predictor ranges.
x.grid <- seq(from = min(df$x1), to = max(df$x1), by = 0.05)
y.grid <- seq(from = min(df$x2), to = max(df$x2), by = 0.05)
xy.grid <- setNames(expand.grid(x.grid, y.grid), c("x1", "x2"))

# rpart tree of y on both predictors (default settings), evaluated at every
# lattice point.
m <- rpart(y ~ x1 + x2, data = df)
xy.grid$yhat <- predict(m, newdata = xy.grid)

# Observations, the predicted surface as small translucent points, and contour
# lines at the quartiles of the predictions. The view is clipped to the
# lattice extent and the legend is hidden.
p <- ggplot(df, aes(x = x1, y = x2)) +
  geom_point(aes(colour = y, fill = y), shape = 21) +
  geom_point(aes(colour = yhat), data = xy.grid, size = 0.5, alpha = 0.4) +
  stat_contour(
    aes(z = yhat),
    data = xy.grid, colour = "gray30",
    breaks = quantile(xy.grid$yhat, probs = seq(0, 1, by = 0.25))
  ) +
  scale_colour_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  scale_fill_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  coord_cartesian(xlim = range(xy.grid$x1), ylim = range(xy.grid$x2)) +
  labs(x = "Disease Severity Score (x1)", y = "Social Determinants Score (x2)") +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
print(p)
ggsave(
  filename = "../img/esl-reg-decision-tree.png", plot = p,
  width = 4, height = 4, units = "in", dpi = 300
)

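Standard deviation reduction: the outcome `y` is simulated from two Gaussians (means 0 and 3). `x1` is a coin flip unrelated to `y`, while `x2` equals 1 with probability 0.1 or 0.9 depending on which Gaussian `y` came from, so splitting on `x2` separates the two outcome distributions far better than splitting on `x1`.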
# Simulate 200 outcomes from two unit-variance Gaussians (means 0 and 3).
# x1 is a fair coin flip independent of y; x2 is 1 with probability `gauss_id`
# (0.1 for the first Gaussian, 0.9 for the second), so it tracks the component.
# The order of the random draws is kept so the seed yields the same data.
set.seed(200)
df <- data.frame(
  y = c(rnorm(100, 0, 1), rnorm(100, 3, 1)),
  gauss_id = c(rep(0.1, 100), rep(0.9, 100)),
  x1 = rbinom(200, 1, 0.5)
) %>%
  mutate(x2 = rbinom(length(x1), 1, gauss_id)) %>%
  # Long format: one row per (observation, variable). pivot_longer() replaces
  # the superseded gather(); only the row order differs, which the histogram
  # below does not depend on.
  pivot_longer(x1:x2, names_to = "variable", values_to = "value")

# Stacked histogram of y, filled by the binary variable's value, with one
# facet row per variable.
p <- ggplot(df) +
  geom_histogram(
    aes(x = y, fill = as.factor(value)),
    position = "stack", alpha = 0.5, bins = 30
  ) +
  theme_bw() +
  scale_fill_discrete(name = "Variable Value") +
  facet_wrap(~variable, nrow = 2) +
  ylab("Count") +
  xlab("Outcome (y)")
print(p)
ggsave(p, filename = "../img/esl-reg-decision-tree-varsplit.png", height = 4, width = 6, units = "in", dpi = 300)

---
title: "Regression"
output: html_notebook
---

## Libraries

```{r setup}
library(ggplot2)
library(scales)
library(dplyr)
library(survival)
library(survminer)
library(lubridate)
library(tidyr)
library(stringr)
library(forcats)
library(wesanderson)
library(class)
library(rpart)
library(rpart.plot)
library(plotly)
```

## ESL Dataset

```{r}
# Load the ESL mixture example. The .RData file is expected to define
# `mixture.example`, whose `x` component supplies the two predictor columns.
load("../data/mixture.example.RData")

# Parameters of the linear signal from which the outcome is simulated.
intercept <- 50.0
coef_x1 <- 10.0
coef_x2 <- -2.0

# Fix the RNG state so the Gaussian noise (sd = 5) added to y is reproducible.
set.seed(1000)
df <- data.frame(
  x1 = mixture.example$x[, 1],
  x2 = mixture.example$x[, 2]
) %>%
  mutate(
    y = intercept + coef_x1 * x1 + coef_x2 * x2 +
      rnorm(length(x1), mean = 0, sd = 5)
  )

# Coarse (0.1-spaced) axis sequences spanning the observed predictor ranges;
# the next chunk expands them into the background lattice.
x.grid <- seq(from = min(df$x1), to = max(df$x1), by = 0.1)
y.grid <- seq(from = min(df$x2), to = max(df$x2), by = 0.1)

# Quick look at the distribution of the simulated outcome.
hist(df$y)
```

Figure (just the dataset)

```{r}
# Background lattice built from the x.grid / y.grid sequences defined in the
# dataset chunk; it is drawn as faint gray dots on top of the observations.
xy.grid <- setNames(expand.grid(x.grid, y.grid), c("x1", "x2"))

# Observations coloured by outcome (shape 21 uses both colour and fill), with
# the lattice overlaid. Axis tick labels and panel grid lines are hidden.
p <- ggplot(df, aes(x = x1, y = x2)) +
  geom_point(aes(colour = y, fill = y), shape = 21) +
  geom_point(data = xy.grid, colour = "gray50", size = 0.02, alpha = 0.5) +
  scale_colour_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  scale_fill_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  labs(x = "Disease Severity Score (x1)", y = "Social Determinants Score (x2)") +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    legend.position = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
print(p)
ggsave(
  filename = "../img/esl-reg-just-data.png", plot = p,
  width = 4, height = 4.5, units = "in", dpi = 300
)
```

Linear regression

```{r}
# Prediction lattice at 0.05 spacing over the observed predictor ranges.
x.grid <- seq(from = min(df$x1), to = max(df$x1), by = 0.05)
y.grid <- seq(from = min(df$x2), to = max(df$x2), by = 0.05)
xy.grid <- setNames(expand.grid(x.grid, y.grid), c("x1", "x2"))

# Least-squares fit of y on both predictors, evaluated at every lattice point.
m <- lm(y ~ x1 + x2, data = df)
xy.grid$yhat <- predict(m, newdata = xy.grid)

# Observations, then the predicted surface as small translucent points, then
# contour lines at the quartiles of the predictions. The view is clipped to
# the lattice extent and the legend is hidden.
p <- ggplot(df, aes(x = x1, y = x2)) +
  geom_point(aes(colour = y, fill = y), shape = 21) +
  geom_point(aes(colour = yhat), data = xy.grid, size = 0.5, alpha = 0.4) +
  stat_contour(
    aes(z = yhat),
    data = xy.grid, colour = "gray30",
    breaks = quantile(xy.grid$yhat, probs = seq(0, 1, by = 0.25))
  ) +
  scale_colour_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  scale_fill_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  coord_cartesian(xlim = range(xy.grid$x1), ylim = range(xy.grid$x2)) +
  labs(x = "Disease Severity Score (x1)", y = "Social Determinants Score (x2)") +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
print(p)
ggsave(
  filename = "../img/esl-reg-linear.png", plot = p,
  width = 4, height = 4, units = "in", dpi = 300
)
```

KNN with K=15

```{r}
# Prediction lattice at 0.05 spacing over the observed predictor ranges.
x.grid <- seq(min(df$x1), max(df$x1), 0.05)
y.grid <- seq(min(df$x2), max(df$x2), 0.05)
xy.grid <- expand.grid(x.grid, y.grid)
names(xy.grid) <- c("x1", "x2")

# K-nearest-neighbour regression: the prediction for each test row is the mean
# outcome of its k closest training rows (Euclidean distance).
#
# class::knn() is a classifier. Given a continuous outcome it treats every
# distinct y as its own class, so the k-way vote ties and one neighbour's y is
# returned at random, which is not a regression estimate.
#
# train_x: training predictors (data frame or matrix, one row per observation)
# train_y: numeric outcomes, one per row of train_x
# test_x:  points to predict at, same columns as train_x
# k:       number of neighbours to average
# Returns a numeric vector with one prediction per row of test_x. Distance
# ties at the k-th neighbour are resolved by training-row order.
knn_regress <- function(train_x, train_y, test_x, k) {
  # Transposed so that subtracting one test point recycles down each column.
  train_t <- t(as.matrix(train_x))
  test_x <- as.matrix(test_x)
  stopifnot(
    nrow(train_t) == ncol(test_x),
    ncol(train_t) == length(train_y),
    k >= 1, k <= length(train_y)
  )
  vapply(seq_len(nrow(test_x)), function(i) {
    sq_dist <- colSums((train_t - test_x[i, ])^2)
    mean(train_y[order(sq_dist)[seq_len(k)]])
  }, numeric(1))
}

m <- knn_regress(df[, c("x1", "x2")], df$y, xy.grid[, c("x1", "x2")], k = 15)
xy.grid$yhat <- m

# Observations, the predicted surface as small translucent points, and contour
# lines at the quartiles of the predictions; legend hidden.
p <- ggplot(df) +
  geom_point(aes(x = x1, y = x2, colour = y, fill = y), pch = 21) +
  geom_point(aes(x = x1, y = x2, colour = yhat), alpha = 0.4, data = xy.grid, size = 0.5) +
  theme_minimal() +
  xlab("Disease Severity Score (x1)") +
  ylab("Social Determinants Score (x2)") +
  theme(axis.text = element_blank()) +
  scale_fill_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  stat_contour(
    aes(x = x1, y = x2, z = yhat),
    breaks = quantile(xy.grid$yhat, seq(0, 1, 0.25)),
    data = xy.grid, colour = "gray30"
  ) +
  scale_colour_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  coord_cartesian(xlim = range(xy.grid$x1), ylim = range(xy.grid$x2))
print(p)
ggsave(p, filename = "../img/esl-reg-knn-15.png", height = 4, width = 4, units = "in", dpi = 300)
```

Decision tree

```{r}
# Prediction lattice at 0.05 spacing over the observed predictor ranges.
x.grid <- seq(from = min(df$x1), to = max(df$x1), by = 0.05)
y.grid <- seq(from = min(df$x2), to = max(df$x2), by = 0.05)
xy.grid <- setNames(expand.grid(x.grid, y.grid), c("x1", "x2"))

# rpart tree of y on both predictors (default settings), evaluated at every
# lattice point.
m <- rpart(y ~ x1 + x2, data = df)
xy.grid$yhat <- predict(m, newdata = xy.grid)

# Observations, the predicted surface as small translucent points, and contour
# lines at the quartiles of the predictions. The view is clipped to the
# lattice extent and the legend is hidden.
p <- ggplot(df, aes(x = x1, y = x2)) +
  geom_point(aes(colour = y, fill = y), shape = 21) +
  geom_point(aes(colour = yhat), data = xy.grid, size = 0.5, alpha = 0.4) +
  stat_contour(
    aes(z = yhat),
    data = xy.grid, colour = "gray30",
    breaks = quantile(xy.grid$yhat, probs = seq(0, 1, by = 0.25))
  ) +
  scale_colour_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  scale_fill_gradient(name = "Recurrence\nBiomarker (y)", low = "#00bfc4", high = "#f8766d") +
  coord_cartesian(xlim = range(xy.grid$x1), ylim = range(xy.grid$x2)) +
  labs(x = "Disease Severity Score (x1)", y = "Social Determinants Score (x2)") +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
print(p)
ggsave(
  filename = "../img/esl-reg-decision-tree.png", plot = p,
  width = 4, height = 4, units = "in", dpi = 300
)
```

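Standard deviation reduction: the outcome `y` is simulated from two Gaussians (means 0 and 3). `x1` is a coin flip unrelated to `y`, while `x2` equals 1 with probability 0.1 or 0.9 depending on which Gaussian `y` came from, so splitting on `x2` separates the two outcome distributions far better than splitting on `x1`.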

```{r}
# Simulate 200 outcomes from two unit-variance Gaussians (means 0 and 3).
# x1 is a fair coin flip independent of y; x2 is 1 with probability `gauss_id`
# (0.1 for the first Gaussian, 0.9 for the second), so it tracks the component.
# The order of the random draws is kept so the seed yields the same data.
set.seed(200)
df <- data.frame(
  y = c(rnorm(100, 0, 1), rnorm(100, 3, 1)),
  gauss_id = c(rep(0.1, 100), rep(0.9, 100)),
  x1 = rbinom(200, 1, 0.5)
) %>%
  mutate(x2 = rbinom(length(x1), 1, gauss_id)) %>%
  # Long format: one row per (observation, variable). pivot_longer() replaces
  # the superseded gather(); only the row order differs, which the histogram
  # below does not depend on.
  pivot_longer(x1:x2, names_to = "variable", values_to = "value")

# Stacked histogram of y, filled by the binary variable's value, with one
# facet row per variable.
p <- ggplot(df) +
  geom_histogram(
    aes(x = y, fill = as.factor(value)),
    position = "stack", alpha = 0.5, bins = 30
  ) +
  theme_bw() +
  scale_fill_discrete(name = "Variable Value") +
  facet_wrap(~variable, nrow = 2) +
  ylab("Count") +
  xlab("Outcome (y)")
print(p)
ggsave(p, filename = "../img/esl-reg-decision-tree-varsplit.png", height = 4, width = 6, units = "in", dpi = 300)
```


